Data preparation combine table information and journals
None
Main table
papers_meta_analysis_new
Merged with:
Name
Github
from awsPy.aws_authorization import aws_connector
from awsPy.aws_s3 import service_s3
from awsPy.aws_glue import service_glue
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import os, shutil, json, re
from GoogleDrivePy.google_drive import connect_drive
from GoogleDrivePy.google_platform import connect_cloud_platform
from GoogleDrivePy.google_authorization import authorization_service
path = os.getcwd()
parent_path = str(Path(path).parent.parent)
name_credential = 'financial_dep_SO2_accessKeys.csv'
region = 'eu-west-2'
bucket = 'datalake-london'
path_cred = "{0}/creds/{1}".format(parent_path, name_credential)
con = aws_connector.aws_instantiate(credential = path_cred,
region = region)
client= con.client_boto()
s3 = service_s3.connect_S3(client = client,
bucket = bucket, verbose = True)
glue = service_glue.connect_glue(client = client)
pandas_setting = True
if pandas_setting:
cm = sns.light_palette("green", as_cmap=True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
Write query and save the CSV back in the S3 bucket datalake-datascience
DatabaseName = 'esg'
s3_output_example = 'SQL_OUTPUT_ATHENA'
query = """
WITH merge AS (
SELECT
id,
image,
row_id_excel,
table_refer,
row_id_google_spreadsheet,
incremental_id,
paper_name,
publication_year,
publication_type,
regexp_replace(
regexp_replace(
lower(publication_name),
'\&',
'and'
),
'\-',
' '
) as publication_name,
cnrs_ranking,
UPPER(peer_reviewed) as peer_reviewed,
UPPER(study_focused_on_social_environmental_behaviour) as study_focused_on_social_environmental_behaviour,
type_of_data,
CASE WHEN regions = 'ARAB WORLD' THEN 'WORLDWIDE' ELSE regions END AS regions,
CASE WHEN study_focusing_on_developing_or_developed_countries = 'Europe' THEN 'WORLDWIDE' ELSE UPPER(study_focusing_on_developing_or_developed_countries) END AS study_focusing_on_developing_or_developed_countries,
first_date_of_observations,
last_date_of_observations,
CASE WHEN first_date_of_observations >= 1997 THEN 'YES' ELSE 'NO' END AS kyoto,
CASE WHEN first_date_of_observations >= 2009 THEN 'YES' ELSE 'NO' END AS financial_crisis,
last_date_of_observations - first_date_of_observations as windows,
adjusted_model_name,
adjusted_model,
dependent,
adjusted_dependent,
independent,
adjusted_independent,
social,
environmental,
governance,
sign_of_effect,
target,
p_value_significant,
sign_positive,
sign_negative,
lag,
interaction_term,
quadratic_term,
n,
r2,
beta,
to_remove,
test_standard_error,
test_p_value,
test_t_value,
adjusted_standard_error,
adjusted_t_value
FROM
esg.papers_meta_analysis_new
LEFT JOIN (
SELECT
DISTINCT(title),
nr,
publication_year,
publication_type,
publication_name,
cnrs_ranking,
peer_reviewed,
study_focused_on_social_environmental_behaviour,
type_of_data,
study_focusing_on_developing_or_developed_countries
FROM
esg.papers_meta_analysis
) as old on papers_meta_analysis_new.id = old.nr
-- WHERE to_remove = 'TO_KEEP'
LEFT JOIN (
SELECT
nr,
CAST(MIN(first_date_of_observations) as int) as first_date_of_observations,
CAST(MAX(last_date_of_observations)as int) as last_date_of_observations,
min(row_id_excel) as row_id_excel
FROM
esg.papers_meta_analysis
GROUP BY nr
) as date_pub on papers_meta_analysis_new.id = date_pub.nr
LEFT JOIN (
SELECT
nr,
MIN(regions) as regions
FROM
(
SELECT
nr,
CASE WHEN regions_of_selected_firms in (
'Cameroon', 'Egypt', 'Libya', 'Morocco',
'Nigeria'
) THEN 'AFRICA' WHEN regions_of_selected_firms in ('GCC countries') THEN 'ARAB WORLD' WHEN regions_of_selected_firms in (
'India', 'Indonesia', 'Taiwan', 'Vietnam',
'Australia', 'China', 'Iran', 'Malaysia',
'Pakistan', 'South Korea', 'Bangladesh'
) THEN 'ASIA AND PACIFIC' WHEN regions_of_selected_firms in (
'Spain', '20 European countries',
'United Kingdom', 'France', 'Germany, Italy, the Netherlands and United Kingdom',
'Turkey', 'UK'
) THEN 'EUROPE' WHEN regions_of_selected_firms in ('Latin America', 'Brazil') THEN 'LATIN AMERICA' WHEN regions_of_selected_firms in ('USA', 'US', 'U.S.', 'Canada') THEN 'NORTH AMERICA' ELSE 'WORLDWIDE' END AS regions
FROM
papers_meta_analysis
)
GROUP BY
nr
) as reg on papers_meta_analysis_new.id = reg.nr
)
SELECT
to_remove,
id,
image,
row_id_excel,
row_id_google_spreadsheet,
table_refer,
incremental_id,
paper_name,
publication_name,
rank,
sjr,
sjr_best_quartile,
h_index,
total_docs_2020,
total_docs_3years,
total_refs,
total_cites_3years,
citable_docs_3years,
cites_doc_2years,
country ,
publication_year,
publication_type,
cnrs_ranking,
peer_reviewed,
study_focused_on_social_environmental_behaviour,
type_of_data,
regions,
study_focusing_on_developing_or_developed_countries,
first_date_of_observations,
last_date_of_observations - (windows/2) as mid_year,
last_date_of_observations,
kyoto,
financial_crisis,
windows,
adjusted_model_name,
adjusted_model,
dependent,
adjusted_dependent,
independent,
adjusted_independent,
social,
environmental,
governance,
sign_of_effect,
target,
p_value_significant,
sign_positive,
sign_negative,
lag,
interaction_term,
quadratic_term,
n,
r2,
beta,
test_standard_error,
test_p_value,
test_t_value,
adjusted_standard_error,
adjusted_t_value
FROM
merge
LEFT JOIN (
SELECT
rank,
regexp_replace(
regexp_replace(
lower(title),
'\&',
'and'
),
'\-',
' '
) as title,
sjr,
sjr_best_quartile,
h_index,
total_docs_2020,
total_docs_3years,
total_refs,
total_cites_3years,
citable_docs_3years,
cites_doc_2years,
country
FROM
"scimago"."journals_scimago"
WHERE sourceid not in (16400154787)
) as journal on merge.publication_name = journal.title
"""
output = (
s3.run_query(
query=query,
database=DatabaseName,
s3_output=s3_output_example,
filename='example_1',
dtype = {'publication_year':'string'}
)
.sort_values(by = ['id', 'first_date_of_observations'])
.drop_duplicates()
.assign(weight = lambda x: x.groupby(['id'])['id'].transform('size'))
)
output.head()
output.shape
output.describe()
output['weight'].describe()
#output[output.duplicated(subset = ['id', 'beta',
# 'true_standard_error', 'critical_value', 'lag', 'independent',
# 'true_t_value', 'true_stars', 'adjusted_model'
# ])].head()
Missing journals
output.loc[lambda x: x['rank'].isin([np.nan])]['publication_name'].unique()
Currently, the missing values come from the rows to check in METADATA_TABLES_COLLECTION
output.isna().sum().loc[lambda x: x> 0].sort_values()
def make_clickable(val):
return '<a href="{}">{}</a>'.format(val,val)
(
output
.loc[lambda x: x['first_date_of_observations'].isin([np.nan])]
.reindex(columns = ['paper_name', 'row_id_excel'])
.drop_duplicates()
.style
.format(make_clickable, subset = ['row_id_excel'])
)
(
output
.loc[lambda x: x['study_focusing_on_developing_or_developed_countries'].isin([np.nan])]
.reindex(columns = ['paper_name', 'row_id_excel'])
.drop_duplicates()
.style
.format(make_clickable, subset = ['row_id_excel'])
)
(
output
.loc[lambda x: x['peer_reviewed'].isin([np.nan])]
.reindex(columns = ['paper_name', 'row_id_excel'])
.drop_duplicates()
.style
.format(make_clickable, subset = ['row_id_excel'])
)
(
output
.loc[lambda x: x['adjusted_independent'].isin([np.nan])]
.reindex(columns = ['paper_name', 'row_id_google_spreadsheet'])
.drop_duplicates()
.head(5)
.style
.format(make_clickable, subset = ['row_id_google_spreadsheet'])
)
#!pip install --upgrade git+git://github.com/thomaspernet/GoogleDrive-python
try:
os.mkdir("creds")
except:
pass
s3.download_file(key = "CREDS/Financial_dependency_pollution/creds/token.pickle", path_local = "creds")
s3.download_file(key = "CREDS/Financial_dependency_pollution/creds/service.json", path_local = "creds")
auth = authorization_service.get_authorization(
path_credential_gcp=os.path.join(path, "creds", "service.json"),
path_credential_drive=os.path.join(path, "creds"),
verbose=False,
scope=['https://www.googleapis.com/auth/spreadsheets.readonly',
"https://www.googleapis.com/auth/drive"]
)
gd_auth = auth.authorization_drive(path_secret=os.path.join(
path, "creds", "credentials.json"))
service_account = auth.authorization_gcp()
drive = connect_drive.drive_operations(gd_auth)
import shutil
shutil.rmtree(os.path.join(path,"creds"))
FILENAME_SPREADSHEET = "METADATA_MODEL"
spreadsheet_id = drive.find_file_id(FILENAME_SPREADSHEET, to_print=False)
drive.add_data_to_spreadsheet(
data =output.fillna(""),
sheetID =spreadsheet_id,
sheetName = "MODEL_DATA",
detectRange = True,
rangeData = None)
During a presentation (Desir Seminar), it has been pointed out that characteristic of an author might impact the desire results. Most of the information are available from the internet.
We use two sources of information:
Using both data sourcs, we will retrieve or compute the following information:
The workflow works in three steps:
The first step of the workflow consists to train a basic LSTM architecture to deter the gender from family name.
Training the model requires the following steps:
project = 'valid-pagoda-132423'
gcp = connect_cloud_platform.connect_console(project = project,
service_account = service_account,
colab = False)
sql = """
SELECT
name,
gender,
COUNT(name) AS num_names
FROM
`bigquery-public-data.usa_names.usa_1910_current`
GROUP BY
name,
gender
"""
#names_df = client.query(sql).to_dataframe()
names_df = gcp.upload_data_from_bigquery(query = sql,location = "US")
names_df.shape
(35236, 3)
names_df.head()
| name | gender | num_names | |
|---|---|---|---|
| 0 | Mary | F | 5597 |
| 1 | Annie | F | 3994 |
| 2 | Anna | F | 5566 |
| 3 | Margaret | F | 5509 |
| 4 | Helen | F | 4879 |
For example, the preprocessing step does the following:
Take the name "mary", the character "m" is given the number 13, the character "a" is 1, and so one. The 0s are the padding because the matrix should have the same dimension
def preprocess(names_df,column, train=True, to_lower = True):
# Step 1: Lowercase
if to_lower:
names_df['name'] = names_df[column].str.lower()
else:
names_df['name'] = names_df[column]
# Step 2: Split individual characters
names_df['name'] = [list(name) for name in names_df['name']]
# Step 3: Pad names with spaces to make all names same length
name_length = 50
names_df['name'] = [
(name + [' ']*name_length)[:name_length]
for name in names_df['name']
]
# Step 4: Encode Characters to Numbers
names_df['name'] = [
[
max(0.0, ord(char)-96.0)
for char in name
]
for name in names_df['name']
]
if train:
# Step 5: Encode Gender to Numbers
names_df['gender'] = [
0.0 if gender=='F' else 1.0
for gender in names_df['gender']
]
return names_df
else:
return names_df['name']
names_df = preprocess(names_df, column = 'name', train=True)
names_df.head()
| name | gender | num_names | |
|---|---|---|---|
| 0 | [13.0, 1.0, 18.0, 25.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | 0.0 | 5597 |
| 1 | [1.0, 14.0, 14.0, 9.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | 0.0 | 3994 |
| 2 | [1.0, 14.0, 14.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | 0.0 | 5566 |
| 3 | [13.0, 1.0, 18.0, 7.0, 1.0, 18.0, 5.0, 20.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | 0.0 | 5509 |
| 4 | [8.0, 5.0, 12.0, 5.0, 14.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | 0.0 | 4879 |
Note: Embedding layer enables us to convert each word into a fixed length vector of defined size. The resultant vector is a dense one with having real values instead of just 0’s and 1’s. The fixed length of word vectors helps us to represent words in a better way along with reduced dimensions
#!pip install --upgrade tensorflow
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam
def lstm_model(num_alphabets=27, name_length=50, embedding_dim=256):
model = Sequential([
Embedding(num_alphabets, embedding_dim, input_length=name_length),
Bidirectional(LSTM(units=128, recurrent_dropout=0.2, dropout=0.2)),
Dense(1, activation="sigmoid")
])
model.compile(loss='binary_crossentropy',
optimizer=Adam(learning_rate=0.001),
metrics=['accuracy'])
return model
The idea is to turns positive integers (indexes) into dense vectors of fixed size. Then this layer can be used as the first layer in a model.
The size of the vocabulary (the list from the preprocessing) is equal to 27: The alphabet has 26 letters, and the space characters. We want the output layer to be a vector of 256 weights. input_length is the maximum size of the name. We can set it up since the length of input sequences is constant.
Here is an example of how the vector embedding output looks like
Note: In this example we have not trained the embedding layer. The weights assigned to the word vectors are initialized randomly.
model_ex = Sequential()
model_ex.add(Embedding(input_dim= 27, output_dim= 256, input_length=50))
model_ex.compile(loss = 'binary_crossentropy', metrics= 'accuracy')
output_array = model_ex.predict(names_df['name'].values.tolist())
output_array.shape
(35236, 50, 256)
The embedding vector for the first word is:
output_array[0].shape
(50, 256)
output_array[0]
array([[ 0.02053951, -0.02362924, -0.00785675, ..., -0.03053 ,
0.02258325, -0.00690504],
[-0.03187205, 0.01595006, 0.01843704, ..., -0.01692963,
-0.04387647, 0.02379021],
[-0.04373739, 0.01200284, 0.0097186 , ..., 0.04422157,
-0.03941666, 0.00994086],
...,
[ 0.01483032, 0.0451639 , -0.04539229, ..., -0.00638006,
-0.03302629, 0.03264472],
[ 0.01483032, 0.0451639 , -0.04539229, ..., -0.00638006,
-0.03302629, 0.03264472],
[ 0.01483032, 0.0451639 , -0.04539229, ..., -0.00638006,
-0.03302629, 0.03264472]], dtype=float32)
We’ll use the standard tensorflow.keras training pipeline as below
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
%time
# Step 1: Instantiate the model
model = lstm_model(num_alphabets=27, name_length=50, embedding_dim=256)
# Step 2: Split Training and Test Data
X = np.asarray(names_df['name'].values.tolist())
y = np.asarray(names_df['gender'].values.tolist())
X_train, X_test, y_train, y_test = train_test_split(X,
y,
test_size=0.2,
random_state=0)
# Step 3: Train the model
callbacks = [
EarlyStopping(monitor='val_accuracy',
min_delta=1e-3,
patience=5,
mode='max',
restore_best_weights=True,
verbose=1),
]
history = model.fit(x=X_train,
y=y_train,
batch_size=64,
epochs=50,
validation_data=(X_test, y_test),
callbacks=callbacks)
# Step 4: Save the model
model.save('MODELS_AND_DATA/boyorgirl.h5')
# Step 5: Plot accuracies
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='val')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
To get the most information possible about an author, we need to rely on two differents data source:
Both data sources are accessible through an API.
However, Google scholar does not always return the author information, and the spelling is different in both data sources.
In this steps, we will begin with fetching data from Scemantic scholar, then Google scholar.
Our primary objective is to get the information about the gender, but also to evaluate the expertise of an author about ESG. The data source Semantic Scholar has 198,182,311 papers from all fields of science.
Our strategy is to use the API to search for a paper in order to get the related information (DOI, cite, performance) and more importantly, the ID of the author(s). Indeed, to get information about an author, we need to know his/her ID. As soon as we have the ID, we can collect and compute all other information (i.e. gender and expertise)
The workflow is the following:
from serpapi import GoogleSearch
from tqdm import tqdm
import time
import pickle
import re
from tensorflow.keras.models import load_model
import unicodedata
pred_model = load_model('MODELS_AND_DATA/boyorgirl.h5')
#!pip install google-search-results
FILENAME_SPREADSHEET = "CSR Excel File Meta-Analysis - Version 4 - 01.02.2021"
spreadsheet_id = drive.find_file_id(FILENAME_SPREADSHEET, to_print=False)
doi = drive.download_data_from_spreadsheet(
sheetID = spreadsheet_id,
sheetName = "Feuil1",
to_dataframe = True)
Get paper name
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}
field = [
"url",
"title",
"abstract",
"venue",
"year",
"referenceCount",
"citationCount",
"influentialCitationCount",
"isOpenAccess",
"fieldsOfStudy",
"authors"]
field_paper = [
"externalIds",
"url",
"title",
"abstract",
"venue",
"year",
"referenceCount",
"citationCount",
"influentialCitationCount",
"isOpenAccess",
"fieldsOfStudy",
"authors"
]
field_authors = [
"externalIds",
"url",
"name",
"aliases",
"affiliations",
"homepage",
"papers"
]
def find_doi(paper_name):
"""
to keep thing simple, assume first results in the best option
"""
paper_name_clean = (
paper_name
.lower()
.replace(" ", "+")
.replace(" ", "+")
.replace("\n", "+")
.replace(",", "+")
.replace("–", "")
.replace("++", "+")
.replace(":", "")
)
url_paper = 'https://api.semanticscholar.org/graph/v1/paper/search?query={}&fields={}'.format(
paper_name_clean, ",".join(field))
response_1 = requests.get(url_paper, headers=headers)
if response_1.status_code == 200:
response_1 = response_1.json()
if len(response_1['data']) > 0:
url_paper = "https://api.semanticscholar.org/graph/v1/paper/{}?fields={}".format(
response_1['data'][0]['paperId'], ",".join(field_paper))
response_2 = requests.get(url_paper, headers=headers)
if response_2.status_code == 200:
response_2 = response_2.json()
response_2['paper_name_source'] = paper_name
response_2['status'] = 'found'
### find authors details information
authors_fulls = []
for aut in response_1['data'][0]['authors']:
url_author = 'https://api.semanticscholar.org/graph/v1/author/{}?fields={}'.format(
aut['authorId'],
",".join(field_authors))
response_3 = requests.get(url_author, headers=headers)
if response_3.status_code == 200:
authors_fulls.append(response_3.json())
if len(authors_fulls) >0:
response_2['authors_detail'] = authors_fulls
return response_2
else:
return {'paper_name': paper_name, 'status': 'not found'}
else:
return {'paper_name': paper_name, 'status': 'not found'}
else:
return {'paper_name': paper_name, 'status': 'not found', 'status_code':response_1.status_code}
def clean_name(name='Sarah'):
"""
"""
return "".join(
(
c
for c in unicodedata.normalize("NFD", name)
if unicodedata.category(c) != "Mn"
)
).lower().replace("-", ' ')
def prediction_gender(name=["sarah"]):
"""
name should be normalised and a list of candidates
"""
return np.mean(pred_model.predict(
np.asarray(
preprocess(
pd.DataFrame(
name,
columns=['semantic_0']
), column="semantic_0", train=False
)
.values.tolist()
)
))
Below is an example with the following paper:
the authors are:
and the DOI is "10.1002/(SICI)1097-0266(199704)18:4303::AID-SMJ8693.0.CO;2-G"
response = find_doi(paper_name = list(doi['Title'].unique())[-2])
response['externalIds']['DOI']
'10.1002/(SICI)1097-0266(199704)18:4<303::AID-SMJ869>3.0.CO;2-G'
In the next steps, we want to predict the gender of the author. The first author is S. Waddock which is impossible to detect the gender because only one letter displays for the first name. Therefor, we will combine the first name with all the aliases. We add another constraint, the first name should have more than 2 characters:
Then we push all the candidates to the model, and return the average probability. The model gives an average probability of 43%, meaning the author is a female.
prediction_gender(
list(
dict.fromkeys(
[clean_name(name=
re.sub(r"[^a-zA-Z0-9]+", ' ', a
).split(" ")[0]) for a in
[response['authors_detail'][0]['name']] + response['authors_detail'][0]['aliases']
if len(a.split(" ")[0]) >2]
)
)
)
0.43727958
Get full list of information
list_paper_semantic = []
list_failure = []
for i, p in tqdm(enumerate(list(doi['Title'].unique()))):
time.sleep(15)
response = find_doi(paper_name = p)
if response['status'] == 'found':
list_paper_semantic.append(response)
else:
list_failure.append(p)
for i, authors in tqdm(enumerate(list_paper_semantic)):
for author in authors["authors_detail"]:
#### Clean authors
author_clean = [clean_name(name=author["name"].split(" ")[0])] if \
len(clean_name(name=author["name"].split(" ")[0]))>2 else None
if author["aliases"] is not None:
author_clean_alias = list(
dict.fromkeys(
[clean_name(name=
re.sub(r"[^a-zA-Z0-9]+", ' ', a
).split(" ")[0]) for a in author["aliases"] if len(a.split(" ")[0]) >2]
)
)
if author_clean is not None:
author_clean.extend(author_clean_alias)
else:
author_clean = author_clean_alias
#### predict gender
if len(author_clean) > 0:
max_prediction = prediction_gender(name=author_clean)
gender= "MALE" if max_prediction >=.5 else "FEMALE"
else:
max_prediction = None
gender = 'UNKNOWN'
author['gender'] = {'gender': gender, 'probability':max_prediction}
list_failure
Failure:
list_failure = ['The Effect of Corporate Social Responsibility on Financial Performance: Evidence from the Banking Industry in Emerging Economies',
'An examination of corporate social responsibility and financial performance: A study of the top 50 Indonesian listed corporations',
'Does it pay to be different? An analysis of the relationship between corporate social and financial performance (',
'Corporate Social and Environmental Performance and Their Relation to Financial Performance and Institutional Ownership: Empirical Evidence on Canadian Firms',
'The Corporate Social-Financial Performance Relationship: A Typology and Analysis']
for ind, paper in enumerate(list_paper_semantic):
with open("paper_id_{}".format(paper["paperId"]), "w") as outfile:
json.dump(eval(str(paper)), outfile)
s3.upload_file(
file_to_upload="paper_id_{}".format(paper["paperId"]),
destination_in_s3="DATA/JOURNALS/SEMANTIC_SCHOLAR/PAPERS",
)
os.remove("paper_id_{}".format(paper["paperId"]))
# Store data (serialize)
with open('MODELS_AND_DATA/list_paper_semantic.pickle', 'wb') as handle:
pickle.dump(list_paper_semantic, handle)
The strategy with Google Scholar is the same as Semantic scholar:
api_key = ""
def collect_doi_information(doi):
"""
"""
params = {
"engine": "google_scholar",
"q": doi,
"api_key": api_key
}
search = GoogleSearch(params)
results = search.get_dict()
list_authors = []
dic_title = {"search_parameters": results['search_parameters']}
if 'organic_results' in results:
dic_title['status'] = "FOUND"
# SEARCH TITLE
if "link" in results['organic_results'][0] or len(results['organic_results']) == 1:
i = 0
else:
i = 1
for key in [
"title",
"result_id",
"link",
"snippet",
"publication_info",
"cited_by",
]:
if key in results['organic_results'][i]:
dic_title[key] = results['organic_results'][i][key]
elif key in results['organic_results'][i]['inline_links']:
dic_title[key] = results['organic_results'][i]['inline_links'][key]
elif key in results['organic_results'][i]['publication_info']:
dic_title[key] = results['organic_results'][i]['publication_info'][key]
# SEARCH AUTHORS
if 'authors' in dic_title['publication_info']:
dic_title['authors_details'] = []
for i, author in enumerate(dic_title['publication_info']['authors']):
params = {
"engine": "google_scholar_author",
"author_id": author['author_id'],
"api_key": api_key
}
search = GoogleSearch(params)
results_auth = search.get_dict()
dic_authors = {
'search_parameters': results_auth['search_parameters'],
}
if "author" in results_auth:
dic_authors['status'] = 'FOUND'
dic_authors['author'] = results_auth['author']
#### detect Gender
author_clean = [clean_name(name=results_auth['author']['name'].split(" ")[0])] if \
len(clean_name(name=results_auth['author']['name'].split(" ")[0]))>2 else None
if author_clean is not None:
max_prediction = prediction_gender(name=author_clean)
gender= "MALE" if max_prediction >=.5 else "FEMALE"
else:
max_prediction = None
gender = 'UNKNOWN'
dic_authors['gender'] = {'gender': gender, 'probability':max_prediction}
else:
dic_authors['status'] = 'NOT_FOUND'
dic_authors['author'] = {'name':author['authors']}
dic_authors['gender'] = {'gender': 'UNKNOWN', 'probability':None}
dic_title['authors_details'].append(dic_authors)
list_authors.append(dic_authors)
else:
dic_title['status'] = "NOT_FOUND"
return dic_title, list_authors
list_papers_google = []
list_authors_google = []
list_paper_semantic = pickle.load( open( "MODELS_AND_DATA/list_paper_semantic.pickle", "rb" ) )
###
for i, d in tqdm(enumerate(list_paper_semantic)):
if "DOI" in d['externalIds']:
filename = d['paperId']
#### EXTRACT INFORMATION
dic_title, authors = collect_doi_information(d['externalIds']['DOI'])
list_papers_google.append(dic_title)
list_authors_google.extend(authors)
### PAPER
with open('paper_{}'.format(filename), 'w') as outfile:
json.dump(eval(str(dic_title)), outfile)
s3.upload_file(file_to_upload = 'paper_{}'.format(filename),
destination_in_s3 = "DATA/JOURNALS/GOOGLE_SCHOLAR/PAPERS")
os.remove('paper_{}'.format(filename))
### AUTHORS
for a in authors:
id_ = a['search_parameters']['author_id']
with open('authors_{}'.format(id_), 'w') as outfile:
json.dump(eval(str(a)), outfile)
s3.upload_file(file_to_upload = 'authors_{}'.format(id_),
destination_in_s3 = "DATA/JOURNALS/GOOGLE_SCHOLAR/AUTHORS")
os.remove('authors_{}'.format(id_))
len(list_paper_semantic)
len(list_papers_google)
len(list_authors_google)
with open('MODELS_AND_DATA/list_papers_google.pickle', 'wb') as handle:
pickle.dump(list_papers_google, handle)
with open('MODELS_AND_DATA/list_authors_google.pickle', 'wb') as handle:
pickle.dump(list_authors_google, handle)
The final step consists to bring together the paper informations with the authors informations. In the end, we want a table with all the authors information for a given paper. Therefore, the final table has the following dimension: number of papers x number of authors.
To construct the table, we need to proceed in three steps:
The data is saved in Google spreadsheet for validation purposes: AUTHOR_SEMANTIC_GOOGLE
We mentionned earlier that there is no direct link between the author name in Semantic scholar and Google scholar for two reasons. First of all, the reason comes from different spelling. Secondly, not all authors have information in Google scholar
We also need to construct the authors expertise in ESG.
In the part, we will reconstruct the list of author information from Semantic scholar and Google Scholar and construct the expertise following these steps:
import jellyfish
import requests
list_authors_google = pickle.load( open( "MODELS_AND_DATA/list_authors_google.pickle", "rb" ))
list_paper_semantic = pickle.load( open( "MODELS_AND_DATA/list_paper_semantic.pickle", "rb" ))
df_authors = (
pd.json_normalize(list_authors_google)
.assign(
interest=lambda x: x.apply(
lambda x: [
i["title"]
for i in x["author.interests"]
if x["author.interests"] != np.nan
]
if isinstance(x["author.interests"], list)
else np.nan,
axis=1,
),
email_extensition=lambda x: x["author.email"].str.replace(
"Verified email at ", "", regex=False
),
)
.reindex(
columns=[
"status",
"search_parameters.engine",
"search_parameters.author_id",
"search_parameters.hl",
"author.name",
"gender.gender",
"gender.probability",
"author.affiliations",
"author.email",
"author.website",
"author.interests",
"author.thumbnail",
"interest",
"email_extensition"
]
)
)
df_authors.tail(1)
| status | search_parameters.engine | search_parameters.author_id | search_parameters.hl | author.name | gender.gender | gender.probability | author.affiliations | author.email | author.website | author.interests | author.thumbnail | interest | email_extensition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 169 | FOUND | google_scholar_author | mLInqGIAAAAJ | en | Sandra Waddock | FEMALE | 0.050459 | Galligan Chair of Strategy, Boston College | Verified email at bc.edu | http://www2.bc.edu/sandra-waddock | [{'title': 'Management', 'link': 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:management', 'serpapi_link': 'https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Amanagement'}, {'title': 'Corporate Responsibility', 'link': 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:corporate_responsibility', 'serpapi_link': 'https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Acorporate_responsibility'}, {'title': 'Social Corporate responsibility', 'link': 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:social_corporate_responsibility', 'serpapi_link': 'https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Asocial_corporate_responsibility'}, {'title': 'management education', 'link': 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:management_education', 'serpapi_link': 'https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Amanagement_education'}, {'title': 'social partnerships', 'link': 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:social_partnerships', 'serpapi_link': 'https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Asocial_partnerships'}] | https://scholar.googleusercontent.com/citations?view_op=view_photo&user=mLInqGIAAAAJ&citpid=5 | [Management, Corporate Responsibility, Social Corporate responsibility, management education, social partnerships] | bc.edu |
pd.json_normalize(list_paper_semantic, "authors")["name"].nunique()
266
df_authors["author.name"].nunique()
146
To identify the same authors in the list of authors from Semantic scholar and Google scholar, we use the following sets of rules:
Perfect match
Un-perfect or disimilar match
We use the similarity API to compute the similarity score
api_key = ""
def twinword(token1, token2):
"""
"""
url = "https://api.twinword.com/api/text/similarity/latest/"
headers = {
'Host': "api.twinword.com",
"X-Twaip-Key":api_key
}
querystring = {
"text1": token1,
"text2": token2
}
response = requests.get(url, headers=headers,params=querystring)
return response.json()['similarity']
match_author = (
pd.DataFrame(
index=pd.MultiIndex.from_product(
[
pd.json_normalize(list_paper_semantic, "authors")["name"].unique(),
df_authors["author.name"].unique(),
],
names=["semantic", "google"],
)
)
.reset_index()
.apply(lambda x: x.str.lower())
.assign(
semantic=lambda x: x.apply(
lambda x: "".join(
(
c
for c in unicodedata.normalize("NFD", x["semantic"])
if unicodedata.category(c) != "Mn"
)
),
axis=1,
),
google=lambda x: x.apply(
lambda x: "".join(
(
c
for c in unicodedata.normalize("NFD", x["google"])
if unicodedata.category(c) != "Mn"
)
),
axis=1,
),
score_lev = lambda x: x.apply(lambda x: jellyfish.levenshtein_distance(x['semantic'], x['google']),axis =1),
min_score_lev = lambda x: x.groupby('google')['score_lev'].transform(min),
score_hamming= lambda x: x.apply(lambda x: jellyfish.hamming_distance(x['semantic'], x['google']),axis =1),
min_score_hamming = lambda x: x.groupby('google')['score_hamming'].transform(min)
)
)
Filter 1: Potential candidates
test = (
match_author
.loc[lambda x: x['score_lev'] == x['min_score_lev']]
.sort_values(by = ['min_score_lev', "semantic",'google'])
.assign(
similarity = lambda x: x.apply(lambda x: twinword(x['semantic'], x['google']), axis= 1),
max_similarity = lambda x: x.groupby('semantic')['similarity'].transform(max)
)
)
test.shape
(132, 8)
test.head()
| semantic | score_lev | min_score_lev | score_hamming | min_score_hamming | similarity | max_similarity | ||
|---|---|---|---|---|---|---|---|---|
| 37955 | abagail mcwilliams | abagail mcwilliams | 0 | 0 | 0 | 0 | 1.0 | 1.0 |
| 24622 | abraham lioui | abraham lioui | 0 | 0 | 0 | 0 | 1.0 | 1.0 |
| 17880 | amama shaukat | amama shaukat | 0 | 0 | 0 | 0 | 1.0 | 1.0 |
| 17146 | atif acıkgoz | atif acıkgoz | 0 | 0 | 0 | 0 | 1.0 | 1.0 |
| 146 | brahim gaies | brahim gaies | 0 | 0 | 0 | 0 | 1.0 | 1.0 |
test.tail()
| semantic | score_lev | min_score_lev | score_hamming | min_score_hamming | similarity | max_similarity | ||
|---|---|---|---|---|---|---|---|---|
| 25944 | v. crisostomo | vicente lima crisostomo | 11 | 11 | 22 | 18 | 0.400000 | 0.400000 |
| 9528 | erin h. c. kao | erin huichuan kao (高惠娟) | 12 | 12 | 16 | 16 | 0.500000 | 0.500000 |
| 5717 | r. danish | rizwan qaiser danish | 12 | 12 | 17 | 16 | 0.400000 | 0.400000 |
| 20813 | p. bansal | pratima (tima) bansal | 13 | 13 | 20 | 17 | 0.400000 | 0.400000 |
| 6304 | f. cappa | francesco cappa, ph.d. | 15 | 15 | 21 | 18 | 0.333333 | 0.333333 |
test['google'].nunique()
146
Filter 1: 124 exact match
test_1 = (
test
.assign(
size = lambda x: x.groupby('google')['google'].transform("size"),
)
.loc[lambda x: (x['size'] ==1) & (x['max_similarity'] >0.1)]
.loc[lambda x: (x['similarity'] ==x['max_similarity'])]
.sort_values(by = ['max_similarity', "semantic",'google'])
)
test_1['google'].nunique()
124
Test 2: 7 exact match
test_2 = (
test
.loc[lambda x: ~x['google'].isin(test_1['google'].unique())]
.loc[lambda x:(x['similarity'] >0)]
.assign(
size = lambda x: x.groupby('google')['google'].transform("size"),
min_score_hamming = lambda x: x.groupby('google')['score_hamming'].transform(min)
)
.loc[lambda x:(x['min_score_hamming'] == x['score_hamming'])]
)
test_2['google'].nunique()
7
We found 131 authors among the 266:
Semantic scholar provides the list all papers for any authors. In the previous steps, we saved this list.
For instance, the author Abderrahman Jahmane wrote 3 papers in his carreer
[{'paperId': '44af7948d66a4dc62952a863e957faaa5770d13c', 'title': 'Corporate social responsibility and firm value: Guiding through economic policy uncertainty'}, {'paperId': '57bf8e616da8230ca7a961be19affeb8b8ae619d', 'title': 'Corporate social responsibility, financial instability and corporate financial performance: Linear, non-linear and spillover effects – The case of the CAC 40 companies'}, {'paperId': 'eff6f21cc09c572f3bdc8add0d0f43badecbf977', 'title': 'Accounting for endogeneity and the dynamics of corporate social – Corporate financial performance relationship'}]
Source: https://www.semanticscholar.org/author/Abderrahmane-Jahmane/122677227
We will use the paper's title to flag whether it deals with ESG or not.
For the 266 authors, we have collected about 14,443 unique papers. We rely on a naive technique to flag all of the 14.443 papers.
The technique is the following:
In total 857 papers deals with ESG among the 14,443 papers (5.9%)
The last step to create the authors table concatenates the common authors from Semantic scholar and Google scholar with the authors not in Google scholar. When the concatenatation is done, we compute the expertise score as follow, within author:
The distribution of expertise:
The distribution of gender:
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from scipy.spatial import distance
#nltk.download('stopwords')
stop = stopwords.words('english')
def basic_clean(text):
return re.sub(r'[^\w\s]|[|]', '', text).split()
def dumb_search(items):
#return any(item in 'esg environmental social governance' for item in items)
return True if len([i for i in ['esg',"environmental","social","governance"] if i in items]) > 0 else False
def count_esg(papers):
"""
papers list with keys paperId and title
"""
return sum([all_connected_paper.loc[lambda x: x['paperId'].isin([item['paperId']])]['esg'].values[0] for item in papers])
def compute_cosine(entity_1, entity_2):
"""
entity_2: list of words to compare
"""
return [{'word':i, 'cosine': 1 - distance.cosine(
np.array(model_weights.loc[lambda x: x[0].isin([entity_1])].iloc[0,1:]),
np.array(model_weights.loc[lambda x: x[0].isin([i])].iloc[0,1:])
)} for i in entity_2 if not model_weights.loc[lambda x: x[0].isin([i])].empty]
list_papers = []
null = [list_papers.extend(i) for i in pd.json_normalize(list_paper_semantic, "authors_detail")['papers'].to_list()]
all_connected_paper = (
pd.DataFrame(list_papers)
.drop_duplicates()
.assign(
name_clean=lambda x:
x.apply(lambda x:
basic_clean(' '.join([word.lower() for word in x['title'].split() if word not in (stop)])), axis=1)
)
.assign(
esg = lambda x: x.apply(
lambda x:
dumb_search(x['name_clean']), axis = 1)
#compute_cosine(entity_1 = 'esg',entity_2 = x['name_clean']), axis = 1)
)
)
all_connected_paper.shape
(14443, 4)
len([i for i in ['esg',"environmental","social","governance"] if i in
[
"financial", "development", "really", "spur", "nascent", "entrepreneurship", "europe", "a", "panel", "data", "analysis"]
])
0
all_connected_paper.head()
| paperId | title | name_clean | esg | |
|---|---|---|---|---|
| 0 | 44af7948d66a4dc62952a863e957faaa5770d13c | Corporate social responsibility and firm value: Guiding through economic policy uncertainty | [corporate, social, responsibility, firm, value, guiding, economic, policy, uncertainty] | True |
| 1 | 57bf8e616da8230ca7a961be19affeb8b8ae619d | Corporate social responsibility, financial instability and corporate financial performance: Linear, non-linear and spillover effects – The case of the CAC 40 companies | [corporate, social, responsibility, financial, instability, corporate, financial, performance, linear, nonlinear, spillover, effects, the, case, cac, 40, companies] | True |
| 2 | eff6f21cc09c572f3bdc8add0d0f43badecbf977 | Accounting for endogeneity and the dynamics of corporate social – Corporate financial performance relationship | [accounting, endogeneity, dynamics, corporate, social, corporate, financial, performance, relationship] | True |
| 3 | 0fb6c1ff1c0f8dd2128f9c0f2bcac0460cd0cafd | Parenthood and Entrepreneurship: Are Both Possible? | [parenthood, entrepreneurship, are, both, possible] | False |
| 4 | 26b15080c857b06bf0dfdbf90e236b3c22d51180 | Does financial development really spur nascent entrepreneurship in Europe? — A panel data analysis | [does, financial, development, really, spur, nascent, entrepreneurship, europe, a, panel, data, analysis] | False |
all_connected_paper['esg'].value_counts(normalize = True)
False 0.855016 True 0.144984 Name: esg, dtype: float64
#!pip install --upgrade gensim
#%%time
#model = Word2Vec(sentences = all_connected_paper['name_clean'].tolist(),
# vector_size = 100,
# window = 5,
# min_count=5,
# sg = 0)
#model.wv.save_word2vec_format('word2vec_weights_100.txt', binary=False)
#list_header = ['Words'].extend(list(range(1, 101)))
#model_weights = pd.read_csv('word2vec_weights_100.txt',
# sep = ' ', skiprows= 1,
# header= list_header)
%%time
df_authors_full = (
pd.concat(
[
(
pd.concat([test_2, test_1])
.reindex(columns=["google", "semantic", "similarity"])
.merge(
(
df_authors.drop_duplicates(subset="author.name").assign(
google=lambda x: x.apply(
lambda x: "".join(
(
c
for c in unicodedata.normalize("NFD", x["author.name"])
if unicodedata.category(c) != "Mn"
)
),
axis=1,
).str.lower()
)
),
how="left",
on=["google"],
)
.assign(
size=lambda x: x.groupby("semantic")[
"semantic"].transform("size"),
max_similarity=lambda x: x.groupby(
"semantic")["similarity"].transform(max),
)
.loc[lambda x: x["max_similarity"] == x["similarity"]]
),
(
pd.json_normalize(list_paper_semantic, "authors_detail")
.assign(
name=lambda x:x['name'].str.lower(),
semantic=lambda x: x.apply(
lambda x: "".join(
(
c
for c in unicodedata.normalize("NFD", x["name"])
if unicodedata.category(c) != "Mn"
)
),
axis=1,
)
)
.drop_duplicates(subset=['name'])
.loc[
lambda x: ~x["semantic"].isin(
list(pd.concat([test_2, test_1])["semantic"])
)
]
.drop(columns=["name"])
),
],
axis=0,
)
.assign(
missing=lambda x: x['google'].isin([np.nan]),
)
.sort_values(by=['missing', 'semantic'])
.drop(columns=[
'url',
'aliases',
'affiliations',
'homepage',
'papers',
'externalIds.DBLP',
"authorId"
])
.merge(
(
pd.json_normalize(list_paper_semantic, "authors_detail")
.assign(
name=lambda x: x['name'].str.lower(),
semantic=lambda x: x.apply(
lambda x: "".join(
(
c
for c in unicodedata.normalize("NFD", x["name"])
if unicodedata.category(c) != "Mn"
)
),
axis=1,
)
)
.drop_duplicates(subset=['name'])
.drop(columns=['gender.gender', 'gender.probability'])
),
how='right',
on=['semantic']
)
.reindex(
columns=[
'missing',
'authorId',
'name',
'author.name',
'aliases',
'google',
'semantic',
'gender.gender',
'gender.probability',
'similarity',
'status',
'url',
'affiliations',
'author.affiliations',
'homepage',
'author.website',
'author.email',
'email_extensition',
'author.interests',
'interest',
'search_parameters.engine',
'search_parameters.author_id',
'search_parameters.hl',
'author.thumbnail',
'size',
'max_similarity',
'papers'
])
.assign(
total_paper = lambda x: x['papers'].str.len(),
esg = lambda x: x.apply(
lambda x:
count_esg(x['papers']), axis = 1),
pct_esg = lambda x: x['esg']/x['total_paper']
)
)
CPU times: user 10.3 s, sys: 11.1 ms, total: 10.3 s Wall time: 10.4 s
df_authors_full.head(1)
| missing | authorId | name | author.name | aliases | semantic | gender.gender | gender.probability | similarity | status | url | affiliations | author.affiliations | homepage | author.website | author.email | email_extensition | author.interests | interest | search_parameters.engine | search_parameters.author_id | search_parameters.hl | author.thumbnail | size | max_similarity | papers | total_paper | esg | pct_esg | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | True | 122677227 | abderrahmane jahmane | NaN | [Abderrahman Jahmane] | NaN | abderrahmane jahmane | MALE | 0.650541 | NaN | NaN | https://www.semanticscholar.org/author/122677227 | [] | NaN | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [{'paperId': '44af7948d66a4dc62952a863e957faaa5770d13c', 'title': 'Corporate social responsibility and firm value: Guiding through economic policy uncertainty'}, {'paperId': '57bf8e616da8230ca7a961be19affeb8b8ae619d', 'title': 'Corporate social responsibility, financial instability and corporate financial performance: Linear, non-linear and spillover effects – The case of the CAC 40 companies'}, {'paperId': 'eff6f21cc09c572f3bdc8add0d0f43badecbf977', 'title': 'Accounting for endogeneity and the dynamics of corporate social – Corporate financial performance relationship'}] | 3 | 3 | 1.0 |
df_authors_full['pct_esg'].describe()
count 266.000000 mean 0.222054 std 0.208118 min 0.000000 25% 0.088164 50% 0.166667 75% 0.302273 max 1.000000 Name: pct_esg, dtype: float64
df_authors_full['gender.gender'].value_counts()
MALE 179 FEMALE 82 UNKNOWN 5 Name: gender.gender, dtype: int64
df_authors_full.shape
(266, 30)
list_papers_google = pickle.load( open( "MODELS_AND_DATA/list_papers_google.pickle", "rb" ))
df_google_scholar = (
pd.json_normalize(list_papers_google)
.assign(
nb_authors_google=lambda x: x['publication_info.authors'].str.len()
)
)
df_paper_info_full = (
pd.json_normalize(list_paper_semantic, meta=["externalIds"]).rename(columns = {'authors_detail':'author_details_semantic'})
.drop(columns=["paper_name_source"])
.assign(nb_authors=lambda x: x["authors"].str.len())
.merge(
df_google_scholar.drop(columns=["title", "status"]).rename(columns = {'authors_details':'author_details_google'}),
how="left",
left_on=["externalIds.DOI"],
right_on=["search_parameters.q"],
)
.assign(missing_authors_info=lambda x: x["nb_authors"] != x["nb_authors_google"])
.reindex(
columns=[
"paperId",
"url",
"title",
"abstract",
"venue",
"year",
"nb_authors",
"nb_authors_google",
"missing_authors_info",
"authors",
"author_details_semantic",
"author_details_google",
"referenceCount",
"citationCount",
"cited_by.total",
"cited_by.link",
"cited_by.cites_id",
"cited_by.serpapi_scholar_link",
"influentialCitationCount",
"isOpenAccess",
"fieldsOfStudy",
"status",
"Levenshtein_score",
"externalIds.MAG",
"externalIds.DOI",
"externalIds.DBLP",
"result_id",
"link",
"snippet",
"search_parameters.engine",
"search_parameters.q",
"search_parameters.hl",
"publication_info.summary",
"publication_info.authors"
]
)
)
df_authors_journal_full = (
df_paper_info_full.assign(
authors_list=lambda x: x.apply(
lambda x: [i["name"]
for i in x["authors"] if x["authors"] != np.nan]
if isinstance(x["authors"], list)
else np.nan,
axis=1,
)
)
.explode("authors_list")
.assign(
authors_list=lambda x: x["authors_list"].str.lower(),
semantic=lambda x: x.apply(
lambda x: "".join(
(
c
for c in unicodedata.normalize("NFD", x["authors_list"])
if unicodedata.category(c) != "Mn"
)
),
axis=1,
),
)
.merge(df_authors_full, how="left", on=["semantic"])
)
df_authors_journal_full.shape
(309, 65)
FILENAME_SPREADSHEET = "AUTHOR_SEMANTIC_GOOGLE"
df_authors_journal_full.to_csv('AUTHOR_SEMANTIC_GOOGLE.csv', index = False)
drive.upload_file_root(mime_type = 'text/plain',
file_name = 'AUTHOR_SEMANTIC_GOOGLE.csv',
local_path = "AUTHOR_SEMANTIC_GOOGLE.csv"
)
drive.move_file(file_name = 'AUTHOR_SEMANTIC_GOOGLE.csv', folder_name = "SPREADSHEETS_ESG_METADATA")
Found file: SPREADSHEETS_ESG_METADATA (1GxZg6aF5H1JMfGulEoNCbLZVnJqSpSMh) Found file: AUTHOR_SEMANTIC_GOOGLE.csv (1Kk_BW_teNYXA_7NroyyJsTkNqMeov_fN) File AUTHOR_SEMANTIC_GOOGLE.csv moved to SPREADSHEETS_ESG_METADATA
{'file_name': 'AUTHOR_SEMANTIC_GOOGLE.csv',
'folder_name': 'SPREADSHEETS_ESG_METADATA'}
meta_analysis_esg_cfp¶Since the table to create has missing value, please use the following at the top of the query
CREATE TABLE database.table_name WITH (format = 'PARQUET') AS
Choose a location in S3 to save the CSV. It is recommended to save in it the datalake-datascience bucket. Locate an appropriate folder in the bucket, and make sure all output have the same format
s3_output = 'DATA/FINANCE/ESG/ESG_CFP'
table_name = 'meta_analysis_esg_cfp'
First, we need to delete the table (if exist)
try:
response = glue.delete_table(
database=DatabaseName,
table=table_name
)
print(response)
except Exception as e:
print(e)
{'ResponseMetadata': {'RequestId': '2bacf411-062c-4fea-b97c-6913d071ecca', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 25 Oct 2021 08:10:13 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '2', 'connection': 'keep-alive', 'x-amzn-requestid': '2bacf411-062c-4fea-b97c-6913d071ecca'}, 'RetryAttempts': 0}}
Clean up the folder with the previous csv file. Be careful, it will erase all files inside the folder
s3.remove_all_bucket(path_remove = s3_output)
True
%%time
query = """
-- CREATE TABLE {0}.{1} WITH (format = 'PARQUET') AS
WITH merge AS (
SELECT
id,
image,
row_id_excel,
table_refer,
row_id_google_spreadsheet,
incremental_id,
paper_name,
publication_year,
publication_type,
regexp_replace(
regexp_replace(
lower(publication_name),
'\&',
'and'
),
'\-',
' '
) as publication_name,
cnrs_ranking,
UPPER(peer_reviewed) as peer_reviewed,
UPPER(study_focused_on_social_environmental_behaviour) as study_focused_on_social_environmental_behaviour,
type_of_data,
CASE WHEN regions = 'ARAB WORLD' THEN 'WORLDWIDE' ELSE regions END AS regions,
CASE WHEN study_focusing_on_developing_or_developed_countries = 'Europe' THEN 'WORLDWIDE' ELSE UPPER(study_focusing_on_developing_or_developed_countries) END AS study_focusing_on_developing_or_developed_countries,
first_date_of_observations,
last_date_of_observations,
CASE WHEN first_date_of_observations >= 1997 THEN 'YES' ELSE 'NO' END AS kyoto,
CASE WHEN first_date_of_observations >= 2009 THEN 'YES' ELSE 'NO' END AS financial_crisis,
last_date_of_observations - first_date_of_observations as windows,
adjusted_model_name,
adjusted_model,
dependent,
adjusted_dependent,
independent,
adjusted_independent,
social,
environmental,
governance,
sign_of_effect,
target,
p_value_significant,
sign_positive,
sign_negative,
lag,
interaction_term,
quadratic_term,
n,
r2,
beta,
to_remove,
test_standard_error,
test_p_value,
test_t_value,
adjusted_standard_error,
adjusted_t_value
FROM
esg.papers_meta_analysis_new
LEFT JOIN (
SELECT
DISTINCT(title),
nr,
publication_year,
publication_type,
publication_name,
cnrs_ranking,
peer_reviewed,
study_focused_on_social_environmental_behaviour,
type_of_data,
study_focusing_on_developing_or_developed_countries
FROM
esg.papers_meta_analysis
) as old on papers_meta_analysis_new.id = old.nr
-- WHERE to_remove = 'TO_KEEP'
LEFT JOIN (
SELECT
nr,
CAST(MIN(first_date_of_observations) as int) as first_date_of_observations,
CAST(MAX(last_date_of_observations)as int) as last_date_of_observations,
min(row_id_excel) as row_id_excel
FROM
esg.papers_meta_analysis
GROUP BY nr
) as date_pub on papers_meta_analysis_new.id = date_pub.nr
LEFT JOIN (
SELECT
nr,
MIN(regions) as regions
FROM
(
SELECT
nr,
CASE WHEN regions_of_selected_firms in (
'Cameroon', 'Egypt', 'Libya', 'Morocco',
'Nigeria'
) THEN 'AFRICA' WHEN regions_of_selected_firms in ('GCC countries') THEN 'ARAB WORLD' WHEN regions_of_selected_firms in (
'India', 'Indonesia', 'Taiwan', 'Vietnam',
'Australia', 'China', 'Iran', 'Malaysia',
'Pakistan', 'South Korea', 'Bangladesh'
) THEN 'ASIA AND PACIFIC' WHEN regions_of_selected_firms in (
'Spain', '20 European countries',
'United Kingdom', 'France', 'Germany, Italy, the Netherlands and United Kingdom',
'Turkey', 'UK'
) THEN 'EUROPE' WHEN regions_of_selected_firms in ('Latin America', 'Brazil') THEN 'LATIN AMERICA' WHEN regions_of_selected_firms in ('USA', 'US', 'U.S.', 'Canada') THEN 'NORTH AMERICA' ELSE 'WORLDWIDE' END AS regions
FROM
papers_meta_analysis
)
GROUP BY
nr
) as reg on papers_meta_analysis_new.id = reg.nr
)
SELECT
to_remove,
id,
image,
row_id_excel,
row_id_google_spreadsheet,
table_refer,
incremental_id,
paper_name,
publication_name,
rank,
sjr,
sjr_best_quartile,
h_index,
total_docs_2020,
total_docs_3years,
total_refs,
total_cites_3years,
citable_docs_3years,
cites_doc_2years,
country ,
publication_year,
publication_type,
cnrs_ranking,
peer_reviewed,
study_focused_on_social_environmental_behaviour,
type_of_data,
regions,
study_focusing_on_developing_or_developed_countries,
first_date_of_observations,
last_date_of_observations - (windows/2) as mid_year,
last_date_of_observations,
kyoto,
financial_crisis,
windows,
adjusted_model_name,
adjusted_model,
dependent,
adjusted_dependent,
independent,
adjusted_independent,
social,
environmental,
governance,
sign_of_effect,
target,
p_value_significant,
sign_positive,
sign_negative,
lag,
interaction_term,
quadratic_term,
n,
r2,
beta,
test_standard_error,
test_p_value,
test_t_value,
adjusted_standard_error,
adjusted_t_value
FROM
merge
LEFT JOIN (
SELECT
rank,
regexp_replace(
regexp_replace(
lower(title),
'\&',
'and'
),
'\-',
' '
) as title,
sjr,
sjr_best_quartile,
h_index,
total_docs_2020,
total_docs_3years,
total_refs,
total_cites_3years,
citable_docs_3years,
cites_doc_2years,
country
FROM
"scimago"."journals_scimago"
WHERE sourceid not in (16400154787)
) as journal on merge.publication_name = journal.title
""".format(DatabaseName, table_name)
output = s3.run_query(
query=query,
database=DatabaseName,
s3_output=s3_output_example,
filename = "temp"
)
output.head()
CPU times: user 220 ms, sys: 11.6 ms, total: 232 ms Wall time: 1.9 s
| to_remove | id | image | row_id_excel | row_id_google_spreadsheet | table_refer | incremental_id | paper_name | publication_name | rank | sjr | sjr_best_quartile | h_index | total_docs_2020 | total_docs_3years | total_refs | total_cites_3years | citable_docs_3years | cites_doc_2years | country | publication_year | publication_type | cnrs_ranking | peer_reviewed | study_focused_on_social_environmental_behaviour | type_of_data | regions | study_focusing_on_developing_or_developed_countries | first_date_of_observations | mid_year | last_date_of_observations | kyoto | financial_crisis | windows | adjusted_model_name | adjusted_model | dependent | adjusted_dependent | independent | adjusted_independent | social | environmental | governance | sign_of_effect | target | p_value_significant | sign_positive | sign_negative | lag | interaction_term | quadratic_term | n | r2 | beta | test_standard_error | test_p_value | test_t_value | adjusted_standard_error | adjusted_t_value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | TO_KEEP | 1 | https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=479654123&range=B2 | https://docs.google.com/spreadsheets/d/11A3l50yfiGxxRuyV-f3WV9Z-4DcsQLYW6XBl4a7U4bQ/edit#gid=18346659&range=B10 | https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=899172650&range=B2 | table 4 | 1 | Corporate social responsibility financial instability and corporate | finance research letters | 2728.0 | 1.339 | Q1 | 39.0 | 442.0 | 567.0 | 11373.0 | 2909.0 | 566.0 | 5.47 | Netherlands | 2020.0 | Journal Article | 3.0 | YES | ENVIRONMENTAL SOCIAL AND GOVERNANCE | Panel data | EUROPE | DEVELOPED | 2002.0 | 2010.0 | 2017.0 | YES | NO | 15.0 | GMM | OTHER | ROA | ROA | CSR | CSR | YES | NO | NO | INSIGNIFICANT | NOT_SIGNIFICANT | INSIGNIFICANT | NO | NO | NO | NO | NO | 469.0 | 0.108 | 0.0122 | OK | NO_PV | NO_TV | 0.0135 | 0.904 |
| 1 | TO_KEEP | 1 | https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=479654123&range=B2 | https://docs.google.com/spreadsheets/d/11A3l50yfiGxxRuyV-f3WV9Z-4DcsQLYW6XBl4a7U4bQ/edit#gid=18346659&range=B10 | https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=899172650&range=B3 | table 4 | 2 | Corporate social responsibility financial instability and corporate | finance research letters | 2728.0 | 1.339 | Q1 | 39.0 | 442.0 | 567.0 | 11373.0 | 2909.0 | 566.0 | 5.47 | Netherlands | 2020.0 | Journal Article | 3.0 | YES | ENVIRONMENTAL SOCIAL AND GOVERNANCE | Panel data | EUROPE | DEVELOPED | 2002.0 | 2010.0 | 2017.0 | YES | NO | 15.0 | GMM | OTHER | ROA | ROA | CSR | CSR | YES | NO | NO | POSITIVE | SIGNIFICANT | 1_PERCENT | YES | NO | NO | NO | NO | 469.0 | 0.198 | 1.2115 | OK | NO_PV | NO_TV | 0.1581 | 7.663 |
| 2 | TO_KEEP | 1 | https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=479654123&range=B2 | https://docs.google.com/spreadsheets/d/11A3l50yfiGxxRuyV-f3WV9Z-4DcsQLYW6XBl4a7U4bQ/edit#gid=18346659&range=B10 | https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=899172650&range=B4 | table 4 | 3 | Corporate social responsibility financial instability and corporate | finance research letters | 2728.0 | 1.339 | Q1 | 39.0 | 442.0 | 567.0 | 11373.0 | 2909.0 | 566.0 | 5.47 | Netherlands | 2020.0 | Journal Article | 3.0 | YES | ENVIRONMENTAL SOCIAL AND GOVERNANCE | Panel data | EUROPE | DEVELOPED | 2002.0 | 2010.0 | 2017.0 | YES | NO | 15.0 | GMM | OTHER | ROE | ROE | CSR | CSR | YES | NO | NO | POSITIVE | SIGNIFICANT | 1_PERCENT | YES | NO | NO | NO | NO | 469.0 | 0.857 | 0.3959 | OK | NO_PV | NO_TV | 0.0800 | 4.949 |
| 3 | TO_KEEP | 1 | https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=479654123&range=B2 | https://docs.google.com/spreadsheets/d/11A3l50yfiGxxRuyV-f3WV9Z-4DcsQLYW6XBl4a7U4bQ/edit#gid=18346659&range=B10 | https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=899172650&range=B5 | table 4 | 4 | Corporate social responsibility financial instability and corporate | finance research letters | 2728.0 | 1.339 | Q1 | 39.0 | 442.0 | 567.0 | 11373.0 | 2909.0 | 566.0 | 5.47 | Netherlands | 2020.0 | Journal Article | 3.0 | YES | ENVIRONMENTAL SOCIAL AND GOVERNANCE | Panel data | EUROPE | DEVELOPED | 2002.0 | 2010.0 | 2017.0 | YES | NO | 15.0 | GMM | OTHER | ROE | ROE | CSR | CSR | YES | NO | NO | POSITIVE | SIGNIFICANT | 1_PERCENT | YES | NO | NO | NO | NO | 469.0 | 0.375 | 6.8766 | OK | NO_PV | NO_TV | 0.5481 | 12.546 |
| 4 | TO_KEEP | 1 | https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=479654123&range=B2 | https://docs.google.com/spreadsheets/d/11A3l50yfiGxxRuyV-f3WV9Z-4DcsQLYW6XBl4a7U4bQ/edit#gid=18346659&range=B10 | https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=899172650&range=B6 | table 4 | 5 | Corporate social responsibility financial instability and corporate | finance research letters | 2728.0 | 1.339 | Q1 | 39.0 | 442.0 | 567.0 | 11373.0 | 2909.0 | 566.0 | 5.47 | Netherlands | 2020.0 | Journal Article | 3.0 | YES | ENVIRONMENTAL SOCIAL AND GOVERNANCE | Panel data | EUROPE | DEVELOPED | 2002.0 | 2010.0 | 2017.0 | YES | NO | 15.0 | GMM | OTHER | TobinQ | TOBIN_Q | CSR | CSR | YES | NO | NO | NEGATIVE | SIGNIFICANT | 10_PERCENT | NO | YES | NO | NO | NO | 467.0 | 0.539 | -0.5037 | OK | NO_PV | NO_TV | 0.2497 | -2.017 |
output.shape
(701, 59)
Use Semantic scholar to find ID
def find_id(paper_name):
"""
to keep thing simple, assume first results in the best option
"""
paper_name_clean = (
paper_name
.lower()
.replace(" ", "+")
.replace(" ", "+")
.replace("\n", "+")
.replace(",", "+")
.replace("–", "")
.replace("++", "+")
.replace(":", "")
)
url_paper = 'https://api.semanticscholar.org/graph/v1/paper/search?query={}&fields={}'.format(
paper_name_clean, ",".join(field))
response_1 = requests.get(url_paper, headers=headers)
if response_1.status_code == 200:
response_1 = response_1.json()
if len(response_1['data']) > 0:
url_paper = "https://api.semanticscholar.org/graph/v1/paper/{}?fields={}".format(
response_1['data'][0]['paperId'], ",".join(field_paper))
response_2 = requests.get(url_paper, headers=headers)
return {'paper_name': paper_name, 'paperId':response_2.json()['paperId']}
find_list = False
if find_list:
list_ids = []
failure = []
for p in tqdm(list(output['paper_name'].unique())):
time.sleep(5)
try:
list_ids.append(find_id(p))
except:
failure.append(p)
(df_authors_journal_full.groupby(["title", 'paperId'])
.agg(
{
"nb_authors": "max",
"referenceCount": "sum",
"citationCount": "sum",
"cited_by.total": "sum",
"isOpenAccess": "min",
"total_paper": "sum",
"esg": "sum",
}
)
)
| nb_authors | referenceCount | citationCount | cited_by.total | isOpenAccess | total_paper | esg | ||
|---|---|---|---|---|---|---|---|---|
| title | paperId | |||||||
| L’impact de la responsabilité sociale (RSE) sur la performance financière des entreprises (PFE) au Cameroun | 732b67567b0ab51ca047fa0f3ebc89de29bbc8a4 | 1 | 9 | 1 | 9.0 | False | 2 | 0 |
| RSE et performance financière : une approche par la communication des entreprises | 0bf6400dcc8d2a9c1b02c650cc8e0ebfedf99670 | 2 | 36 | 8 | 0.0 | True | 24 | 0 |
| Y a-t-il un impact de la RSE sur la performance financière de l'entreprise : Etude empirique sur les sociétés marocaines cotées à la bourse de Casablanca | fea756f43c5bebc539763ec247e3fe23c19c3d36 | 2 | 66 | 6 | 0.0 | False | 11 | 0 |
(
df_authors_journal_full.groupby(["paperId"])['gender.gender'].value_counts()
.unstack(-1)
.fillna(0)
.assign(
total = lambda x: x.sum(axis=1),
pct_female = lambda x: x['FEMALE']/x['total']
)
.reset_index()
.drop(columns = ['total'])
)
| gender.gender | paperId | FEMALE | MALE | UNKNOWN | pct_female |
|---|---|---|---|---|---|
| 0 | 02281aebff7110c8b6efb59ebba448ecb7e2a4cc | 2.0 | 1.0 | 0.0 | 0.666667 |
| 1 | 0bf6400dcc8d2a9c1b02c650cc8e0ebfedf99670 | 0.0 | 2.0 | 0.0 | 0.000000 |
| 2 | 0ee4d664b5bb34e12ef2de90c8c45eaf2807449f | 0.0 | 2.0 | 1.0 | 0.000000 |
| 3 | 1269b93b9569db9e0732717734920f4d22e0cf2f | 1.0 | 0.0 | 0.0 | 1.000000 |
| 4 | 128fd0154eeaf6189fcff693abbd076aad42b900 | 1.0 | 1.0 | 0.0 | 0.500000 |
| ... | ... | ... | ... | ... | ... |
| 101 | f7740cf68307ed71e418b796af3a706cdbd15394 | 2.0 | 2.0 | 0.0 | 0.500000 |
| 102 | f9429b50ef05bfe89fb558d48b187d92233b2a37 | 0.0 | 2.0 | 0.0 | 0.000000 |
| 103 | fa616b7be943b50b1cbab912fcc636389334f1e8 | 2.0 | 1.0 | 0.0 | 0.666667 |
| 104 | fea756f43c5bebc539763ec247e3fe23c19c3d36 | 0.0 | 2.0 | 0.0 | 0.000000 |
| 105 | ff826659b5b1136f2e49fc9e731158b92887d7ed | 0.0 | 2.0 | 0.0 | 0.000000 |
106 rows × 5 columns
df_final = (
df_authors_journal_full.groupby(["paperId"])
.agg(
{
"nb_authors": "max",
"referenceCount": "sum",
"citationCount": "sum",
"cited_by.total": "sum",
"isOpenAccess": "min",
"total_paper": "sum",
"esg": "sum",
}
)
.assign(pct_esg=lambda x: x["esg"] / x["total_paper"])
.reset_index()
.loc[lambda x: x['esg'] != 0]
.merge(pd.DataFrame([i for i in list_ids if i]), on=["paperId"])
.merge(
(
df_authors_journal_full.groupby(
["paperId"])['gender.gender'].value_counts()
.unstack(-1)
.fillna(0)
.assign(
total=lambda x: x.sum(axis=1),
pct_female=lambda x: x['FEMALE']/x['total']
)
.reset_index()
.drop(columns=['total'])
), on=['paperId']
)
.merge(output, on=["paper_name"])
.rename(columns={
'cited_by.total': 'cited_by_total',
'referenceCount': 'reference_count',
'citationCount': 'citation_count',
'isOpenAccess': 'is_open_access',
'cited_by.total': 'cited_by_total',
'FEMALE':'female',
'MALE':'male',
'UNKNOWN':'unknown'
})
)
df_final.shape
(641, 72)
df_final.head(1)
| paperId | nb_authors | reference_count | citation_count | cited_by_total | is_open_access | total_paper | esg | pct_esg | paper_name | female | male | unknown | pct_female | to_remove | id | image | row_id_excel | row_id_google_spreadsheet | table_refer | incremental_id | publication_name | rank | sjr | sjr_best_quartile | h_index | total_docs_2020 | total_docs_3years | total_refs | total_cites_3years | citable_docs_3years | cites_doc_2years | country | publication_year | publication_type | cnrs_ranking | peer_reviewed | study_focused_on_social_environmental_behaviour | type_of_data | regions | study_focusing_on_developing_or_developed_countries | first_date_of_observations | mid_year | last_date_of_observations | kyoto | financial_crisis | windows | adjusted_model_name | adjusted_model | dependent | adjusted_dependent | independent | adjusted_independent | social | environmental | governance | sign_of_effect | target | p_value_significant | sign_positive | sign_negative | lag | interaction_term | quadratic_term | n | r2 | beta | test_standard_error | test_p_value | test_t_value | adjusted_standard_error | adjusted_t_value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 02281aebff7110c8b6efb59ebba448ecb7e2a4cc | 3 | 303 | 1143 | 1743.0 | False | 198 | 48 | 0.242424 | How Corporate Social Responsibility Engagement Strategy Moderates the CSR–Financial Performance Relationship | 2.0 | 1.0 | 0.0 | 0.666667 | TO_KEEP | 64 | https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=479654123&range=B91 | https://docs.google.com/spreadsheets/d/11A3l50yfiGxxRuyV-f3WV9Z-4DcsQLYW6XBl4a7U4bQ/edit#gid=18346659&range=B814 | https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=899172650&range=B470 | table 2 | 469 | journal of management studies | 347.0 | 4.398 | Q1 | 184.0 | 95.0 | 160.0 | 7792.0 | 1265.0 | 147.0 | 6.84 | United Kingdom | 2012.0 | Journal Article | 1.0 | YES | ENVIRONMENTAL SOCIAL AND GOVERNANCE | Panel data | WORLDWIDE | WORLDWIDE | 1998.0 | 2003.0 | 2007.0 | YES | NO | 9.0 | HAUSMAN–TAYLOR MODELLING | OTHER | ROA | ROA | CSR | CSR | YES | NO | NO | POSITIVE | SIGNIFICANT | 10_PERCENT | YES | NO | NO | NO | NO | 1300.0 | 0.2401 | 0.15 | OK | NO_PV | NO_TV | 0.08 | 1.875 |
input_path = 'df_esg_metaanalysis.csv'
df_final.to_csv(input_path, index=False)
# SAVE S3
s3.upload_file(input_path, s3_output)
schema = [
{'Name': 'index', 'Type': 'int', 'Comments': ''},
{'Name': 'paperId', 'Type': 'string', 'Comments': ''},
{'Name': 'nb_authors', 'Type': 'int', 'Comments': ''},
{'Name': 'reference_count', 'Type': 'int', 'Comments': ''},
{'Name': 'citation_count', 'Type': 'int', 'Comments': ''},
{'Name': 'cited_by_total', 'Type': 'int', 'Comments': ''},
{'Name': 'is_open_access', 'Type': 'boolean', 'Comments': ''},
{'Name': 'total_paper', 'Type': 'int', 'Comments': ''},
{'Name': 'esg', 'Type': 'int', 'Comments': ''},
{'Name': 'pct_esg', 'Type': 'float', 'Comments': ''},
{'Name': 'paper_name', 'Type': 'string', 'Comments': ''},
{'Name': 'female', 'Type': 'float', 'Comments': ''},
{'Name': 'male', 'Type': 'float', 'Comments': ''},
{'Name': 'unknown', 'Type': 'float', 'Comments': ''},
{'Name': 'pct_female', 'Type': 'float', 'Comments': ''},
{'Name': 'to_remove', 'Type': 'string', 'Comments': ''},
{'Name': 'id', 'Type': 'int', 'Comments': ''},
{'Name': 'image', 'Type': 'string', 'Comments': ''},
{'Name': 'row_id_excel', 'Type': 'string', 'Comments': ''},
{'Name': 'row_id_google_spreadsheet', 'Type': 'string', 'Comments': ''},
{'Name': 'table_refer', 'Type': 'string', 'Comments': ''},
{'Name': 'incremental_id', 'Type': 'int', 'Comments': ''},
{'Name': 'publication_name', 'Type': 'string', 'Comments': ''},
{'Name': 'rank', 'Type': 'int', 'Comments': ''},
{'Name': 'sjr', 'Type': 'int', 'Comments': ''},
{'Name': 'sjr_best_quartile', 'Type': 'string', 'Comments': ''},
{'Name': 'h_index', 'Type': 'int', 'Comments': ''},
{'Name': 'total_docs_2020', 'Type': 'int', 'Comments': ''},
{'Name': 'total_docs_3years', 'Type': 'int', 'Comments': ''},
{'Name': 'total_refs', 'Type': 'int', 'Comments': ''},
{'Name': 'total_cites_3years', 'Type': 'int', 'Comments': ''},
{'Name': 'citable_docs_3years', 'Type': 'int', 'Comments': ''},
{'Name': 'cites_doc_2years', 'Type': 'int', 'Comments': ''},
{'Name': 'country', 'Type': 'string', 'Comments': ''},
{'Name': 'publication_year', 'Type': 'int', 'Comments': ''},
{'Name': 'publication_type', 'Type': 'string', 'Comments': ''},
{'Name': 'cnrs_ranking', 'Type': 'int', 'Comments': ''},
{'Name': 'peer_reviewed', 'Type': 'string', 'Comments': ''},
{'Name': 'study_focused_on_social_environmental_behaviour', 'Type': 'string', 'Comments': ''},
{'Name': 'type_of_data', 'Type': 'string', 'Comments': ''},
{'Name': 'regions', 'Type': 'string', 'Comments': ''},
{'Name': 'study_focusing_on_developing_or_developed_countries', 'Type': 'string', 'Comments': ''},
{'Name': 'first_date_of_observations', 'Type': 'int', 'Comments': ''},
{'Name': 'mid_year', 'Type': 'int', 'Comments': ''},
{'Name': 'last_date_of_observations', 'Type': 'int', 'Comments': ''},
{'Name': 'kyoto', 'Type': 'string', 'Comments': ''},
{'Name': 'financial_crisis', 'Type': 'string', 'Comments': ''},
{'Name': 'windows', 'Type': 'int', 'Comments': ''},
{'Name': 'adjusted_model_name', 'Type': 'string', 'Comments': ''},
{'Name': 'adjusted_model', 'Type': 'string', 'Comments': ''},
{'Name': 'dependent', 'Type': 'string', 'Comments': ''},
{'Name': 'adjusted_dependent', 'Type': 'string', 'Comments': ''},
{'Name': 'independent', 'Type': 'string', 'Comments': ''},
{'Name': 'adjusted_independent', 'Type': 'string', 'Comments': ''},
{'Name': 'social', 'Type': 'string', 'Comments': ''},
{'Name': 'environmental', 'Type': 'string', 'Comments': ''},
{'Name': 'governance', 'Type': 'string', 'Comments': ''},
{'Name': 'sign_of_effect', 'Type': 'string', 'Comments': ''},
{'Name': 'target', 'Type': 'string', 'Comments': ''},
{'Name': 'p_value_significant', 'Type': 'string', 'Comments': ''},
{'Name': 'sign_positive', 'Type': 'string', 'Comments': ''},
{'Name': 'sign_negative', 'Type': 'string', 'Comments': ''},
{'Name': 'lag', 'Type': 'string', 'Comments': ''},
{'Name': 'interaction_term', 'Type': 'string', 'Comments': ''},
{'Name': 'quadratic_term', 'Type': 'string', 'Comments': ''},
{'Name': 'n', 'Type': 'int', 'Comments': ''},
{'Name': 'r2', 'Type': 'int', 'Comments': ''},
{'Name': 'beta', 'Type': 'int', 'Comments': ''},
{'Name': 'test_standard_error', 'Type': 'string', 'Comments': ''},
{'Name': 'test_p_value', 'Type': 'string', 'Comments': ''},
{'Name': 'test_t_value', 'Type': 'string', 'Comments': ''},
{'Name': 'adjusted_standard_error', 'Type': 'int', 'Comments': ''},
{'Name': 'adjusted_t_value', 'Type': 'int', 'Comments': ''}
]
glue = service_glue.connect_glue(client=client)
target_S3URI = os.path.join("s3://",bucket, s3_output)
name_crawler = "crawl-industry-name"
Role = 'arn:aws:iam::468786073381:role/AWSGlueServiceRole-crawler-datalake'
DatabaseName = "esg"
TablePrefix = 'meta_analysis_' # add "_" after prefix, ex: hello_
glue.create_table_glue(
target_S3URI,
name_crawler,
Role,
DatabaseName,
TablePrefix,
from_athena=False,
update_schema=schema,
)
{'Name': 'meta_analysis_esg_cfp',
'DatabaseName': 'esg',
'Owner': 'owner',
'CreateTime': datetime.datetime(2021, 10, 25, 9, 58, 25, tzinfo=tzlocal()),
'UpdateTime': datetime.datetime(2021, 10, 25, 9, 58, 29, tzinfo=tzlocal()),
'LastAccessTime': datetime.datetime(2021, 10, 25, 9, 58, 25, tzinfo=tzlocal()),
'Retention': 0,
'StorageDescriptor': {'Columns': [{'Name': 'paperid', 'Type': 'string'},
{'Name': 'nb_authors', 'Type': 'int'},
{'Name': 'reference_count', 'Type': 'int'},
{'Name': 'citation_count', 'Type': 'int'},
{'Name': 'cited_by_total', 'Type': 'int'},
{'Name': 'is_open_access', 'Type': 'boolean'},
{'Name': 'total_paper', 'Type': 'int'},
{'Name': 'esg', 'Type': 'int'},
{'Name': 'pct_esg', 'Type': 'float'},
{'Name': 'paper_name', 'Type': 'string'},
{'Name': 'female', 'Type': 'float'},
{'Name': 'male', 'Type': 'float'},
{'Name': 'unknown', 'Type': 'float'},
{'Name': 'pct_female', 'Type': 'float'},
{'Name': 'to_remove', 'Type': 'string'},
{'Name': 'id', 'Type': 'int'},
{'Name': 'image', 'Type': 'string'},
{'Name': 'row_id_excel', 'Type': 'string'},
{'Name': 'row_id_google_spreadsheet', 'Type': 'string'},
{'Name': 'table_refer', 'Type': 'string'},
{'Name': 'incremental_id', 'Type': 'int'},
{'Name': 'publication_name', 'Type': 'string'},
{'Name': 'rank', 'Type': 'int'},
{'Name': 'sjr', 'Type': 'int'},
{'Name': 'sjr_best_quartile', 'Type': 'string'},
{'Name': 'h_index', 'Type': 'int'},
{'Name': 'total_docs_2020', 'Type': 'int'},
{'Name': 'total_docs_3years', 'Type': 'int'},
{'Name': 'total_refs', 'Type': 'int'},
{'Name': 'total_cites_3years', 'Type': 'int'},
{'Name': 'citable_docs_3years', 'Type': 'int'},
{'Name': 'cites_doc_2years', 'Type': 'int'},
{'Name': 'country', 'Type': 'string'},
{'Name': 'publication_year', 'Type': 'int'},
{'Name': 'publication_type', 'Type': 'string'},
{'Name': 'cnrs_ranking', 'Type': 'int'},
{'Name': 'peer_reviewed', 'Type': 'string'},
{'Name': 'study_focused_on_social_environmental_behaviour',
'Type': 'string'},
{'Name': 'type_of_data', 'Type': 'string'},
{'Name': 'regions', 'Type': 'string'},
{'Name': 'study_focusing_on_developing_or_developed_countries',
'Type': 'string'},
{'Name': 'first_date_of_observations', 'Type': 'int'},
{'Name': 'mid_year', 'Type': 'int'},
{'Name': 'last_date_of_observations', 'Type': 'int'},
{'Name': 'kyoto', 'Type': 'string'},
{'Name': 'financial_crisis', 'Type': 'string'},
{'Name': 'windows', 'Type': 'int'},
{'Name': 'adjusted_model_name', 'Type': 'string'},
{'Name': 'adjusted_model', 'Type': 'string'},
{'Name': 'dependent', 'Type': 'string'},
{'Name': 'adjusted_dependent', 'Type': 'string'},
{'Name': 'independent', 'Type': 'string'},
{'Name': 'adjusted_independent', 'Type': 'string'},
{'Name': 'social', 'Type': 'string'},
{'Name': 'environmental', 'Type': 'string'},
{'Name': 'governance', 'Type': 'string'},
{'Name': 'sign_of_effect', 'Type': 'string'},
{'Name': 'target', 'Type': 'string'},
{'Name': 'p_value_significant', 'Type': 'string'},
{'Name': 'sign_positive', 'Type': 'string'},
{'Name': 'sign_negative', 'Type': 'string'},
{'Name': 'lag', 'Type': 'string'},
{'Name': 'interaction_term', 'Type': 'string'},
{'Name': 'quadratic_term', 'Type': 'string'},
{'Name': 'n', 'Type': 'int'},
{'Name': 'r2', 'Type': 'int'},
{'Name': 'beta', 'Type': 'int'},
{'Name': 'test_standard_error', 'Type': 'string'},
{'Name': 'test_p_value', 'Type': 'string'},
{'Name': 'test_t_value', 'Type': 'string'},
{'Name': 'adjusted_standard_error', 'Type': 'int'},
{'Name': 'adjusted_t_value', 'Type': 'int'}],
'Location': 's3://datalake-london/DATA/FINANCE/ESG/ESG_CFP/',
'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
'Compressed': False,
'NumberOfBuckets': -1,
'SerdeInfo': {'SerializationLibrary': 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',
'Parameters': {'field.delim': ','}},
'BucketColumns': [],
'SortColumns': [],
'Parameters': {'CrawlerSchemaDeserializerVersion': '1.0',
'CrawlerSchemaSerializerVersion': '1.0',
'UPDATED_BY_CRAWLER': 'crawl-industry-name',
'areColumnsQuoted': 'false',
'averageRecordSize': '988',
'classification': 'csv',
'columnsOrdered': 'true',
'compressionType': 'none',
'delimiter': ',',
'exclusions': '["s3://datalake-london/DATA/FINANCE/ESG/ESG_CFP/*.csv.metadata"]',
'objectCount': '1',
'recordCount': '618',
'sizeKey': '610824',
'skip.header.line.count': '1',
'typeOfData': 'file'},
'StoredAsSubDirectories': False},
'PartitionKeys': [],
'TableType': 'EXTERNAL_TABLE',
'Parameters': {'CrawlerSchemaDeserializerVersion': '1.0',
'CrawlerSchemaSerializerVersion': '1.0',
'UPDATED_BY_CRAWLER': 'crawl-industry-name',
'areColumnsQuoted': 'false',
'averageRecordSize': '988',
'classification': 'csv',
'columnsOrdered': 'true',
'compressionType': 'none',
'delimiter': ',',
'exclusions': '["s3://datalake-london/DATA/FINANCE/ESG/ESG_CFP/*.csv.metadata"]',
'objectCount': '1',
'recordCount': '618',
'sizeKey': '610824',
'skip.header.line.count': '1',
'typeOfData': 'file'},
'CreatedBy': 'arn:aws:sts::468786073381:assumed-role/AWSGlueServiceRole-crawler-datalake/AWS-Crawler',
'IsRegisteredWithLakeFormation': False,
'CatalogId': '468786073381'}
query_count = """
SELECT COUNT(*) AS CNT
FROM {}.{}
""".format(DatabaseName, table_name)
output = s3.run_query(
query=query_count,
database=DatabaseName,
s3_output=s3_output_example,
filename = 'count_{}'.format(table_name)
)
output
| CNT | |
|---|---|
| 0 | 641 |
This step is mandatory to validate the query in the ETL.
The query is saved in the S3 (bucket datalake-london), but the comments are not available. Use the functions below to update the catalogue and Github
Update the dictionary
Update schema
If automatic = False in automatic_update, then the function returns only the variables to update the comments. Manually add the comment, then, pass the new schema (only the missing comment) to the argument new_schema.
To update the schema, please use the following structure
schema = [
{
"Name": "VAR1",
"Type": "",
"Comment": ""
},
{
"Name": "VAR2",
"Type": "",
"Comment": ""
}
]
%load_ext autoreload
%autoreload 2
import sys
sys.path.append(os.path.join(parent_path, 'utils'))
import make_toc
import create_schema
import create_report
import update_glue_github
The function below manages everything automatically. If the final table comes from more than one query, then pass a list of table in list_tables instead of automatic
list_input, schema = update_glue_github.automatic_update(
list_tables = 'automatic',
automatic= True,
new_schema = None, ### override schema
client = client,
TableName = table_name,
query = query)
description = """
Create table with journal information, papers and coefficients for the meta analysis
"""
name_json = 'parameters_ETL_esg_metadata.json'
partition_keys = ["id", 'incremental_id']
notebookname = "00_meta_analysis.ipynb"
dic_information = {
"client":client,
'bucket':bucket,
's3_output':s3_output,
'DatabaseName':DatabaseName,
'TableName':table_name,
'name_json':name_json,
'partition_keys':partition_keys,
'notebookname':notebookname,
'index_final_table':[0],
'if_final': 'True',
'schema':schema,
'description':description,
'query':query,
"list_input":list_input,
'list_input_automatic':True
}
update_glue_github.update_glue_github(client = client,dic_information = dic_information)
Currently, the ETL has 1 tables
One of the most important step when creating a table is to check if the table contains duplicates. The cell below checks if the table generated before is empty of duplicates. The code uses the JSON file to create the query parsed in Athena.
You are required to define the group(s) that Athena will use to compute the duplicate. For instance, your table can be grouped by COL1 and COL2 (need to be string or varchar), then pass the list ['COL1', 'COL2']
update_glue_github.find_duplicates(
client = client,
bucket = bucket,
name_json = name_json,
partition_keys = partition_keys,
TableName= table_name
)
| CNT | CNT_DUPLICATE | |
|---|---|---|
| 0 | 1 | 641 |
update_glue_github.count_missing(client = client, name_json = name_json, bucket = bucket,TableName = table_name)
| total_missing | total_missing_pct | |
|---|---|---|
| nb_obs | 641 | 100.00% |
| adjusted_t_value | 117 | 18.25% |
| r2 | 101 | 15.76% |
| adjusted_standard_error | 88 | 13.73% |
| cites_doc_2years | 80 | 12.48% |
| citable_docs_3years | 80 | 12.48% |
| total_cites_3years | 80 | 12.48% |
| total_refs | 80 | 12.48% |
| total_docs_3years | 80 | 12.48% |
| total_docs_2020 | 80 | 12.48% |
| h_index | 80 | 12.48% |
| sjr | 80 | 12.48% |
| rank | 80 | 12.48% |
| n | 9 | 1.40% |
| mid_year | 6 | 0.94% |
| first_date_of_observations | 6 | 0.94% |
| last_date_of_observations | 6 | 0.94% |
| windows | 6 | 0.94% |
| cnrs_ranking | 4 | 0.62% |
| beta | 4 | 0.62% |
| governance | 0 | 0.00% |
| test_t_value | 0 | 0.00% |
| kyoto | 0 | 0.00% |
| financial_crisis | 0 | 0.00% |
| test_p_value | 0 | 0.00% |
| adjusted_model_name | 0 | 0.00% |
| adjusted_model | 0 | 0.00% |
| dependent | 0 | 0.00% |
| adjusted_dependent | 0 | 0.00% |
| test_standard_error | 0 | 0.00% |
| quadratic_term | 0 | 0.00% |
| environmental | 0 | 0.00% |
| interaction_term | 0 | 0.00% |
| lag | 0 | 0.00% |
| sign_negative | 0 | 0.00% |
| sign_positive | 0 | 0.00% |
| independent | 0 | 0.00% |
| p_value_significant | 0 | 0.00% |
| target | 0 | 0.00% |
| sign_of_effect | 0 | 0.00% |
| social | 0 | 0.00% |
| adjusted_independent | 0 | 0.00% |
| nb_authors | 0 | 0.00% |
| study_focusing_on_developing_or_developed_countries | 0 | 0.00% |
| regions | 0 | 0.00% |
| citation_count | 0 | 0.00% |
| cited_by_total | 0 | 0.00% |
| is_open_access | 0 | 0.00% |
| total_paper | 0 | 0.00% |
| esg | 0 | 0.00% |
| pct_esg | 0 | 0.00% |
| paper_name | 0 | 0.00% |
| female | 0 | 0.00% |
| male | 0 | 0.00% |
| unknown | 0 | 0.00% |
| pct_female | 0 | 0.00% |
| to_remove | 0 | 0.00% |
| id | 0 | 0.00% |
| image | 0 | 0.00% |
| row_id_excel | 0 | 0.00% |
| row_id_google_spreadsheet | 0 | 0.00% |
| table_refer | 0 | 0.00% |
| incremental_id | 0 | 0.00% |
| publication_name | 0 | 0.00% |
| sjr_best_quartile | 0 | 0.00% |
| country | 0 | 0.00% |
| publication_year | 0 | 0.00% |
| publication_type | 0 | 0.00% |
| paperid | 0 | 0.00% |
| reference_count | 0 | 0.00% |
| study_focused_on_social_environmental_behaviour | 0 | 0.00% |
| type_of_data | 0 | 0.00% |
| peer_reviewed | 0 | 0.00% |
The data catalog is available in Glue. Although, we might want to get a quick access to the tables in Github. In this part, we are generating a README.md in the folder 00_data_catalogue. All tables used in the project will be added to the catalog. We use the ETL parameter file and the schema in Glue to create the README.
Bear in mind the code will erase the previous README.
create_schema.make_data_schema_github(name_json = name_json)
In this part, we are providing basic summary statistic. Since we have created the tables, we can parse the schema in Glue and use our json file to automatically generates the analysis.
The cells below execute the job in the key ANALYSIS. You need to change the primary_key and secondary_key
For a full analysis of the table, please use the following Lambda function. Be patient, it can takes between 5 to 30 minutes. Times varies according to the number of columns in your dataset.
Use the function as follow:
output_prefix: s3://datalake-datascience/ANALYTICS/OUTPUT/TABLE_NAME/region: region where the table is storedbucket: Name of the bucketDatabaseName: Name of the databasetable_name: Name of the tablegroup: variables name to group to count the duplicatesprimary_key: Variable name to perform the grouping -> Only one variable for nowsecondary_key: Variable name to perform the secondary grouping -> Only one variable for nowproba: Chi-square analysis probabilitityy_var: Continuous target variablesCheck the job processing in Sagemaker: https://eu-west-3.console.aws.amazon.com/sagemaker/home?region=eu-west-3#/processing-jobs
The notebook is available: https://s3.console.aws.amazon.com/s3/buckets/datalake-datascience?region=eu-west-3&prefix=ANALYTICS/OUTPUT/&showversions=false
Please, download the notebook on your local machine, and convert it to HTML:
cd "/Users/thomas/Downloads/Notebook"
aws s3 cp s3://datalake-datascience/ANALYTICS/OUTPUT/asif_unzip_data_csv/Template_analysis_from_lambda-2020-11-22-08-12-20.ipynb .
## convert HTML no code
jupyter nbconvert --no-input --to html Template_analysis_from_lambda-2020-11-21-14-30-45.ipynb
jupyter nbconvert --to html Template_analysis_from_lambda-2020-11-22-08-12-20.ipynb
Then upload the HTML to: https://s3.console.aws.amazon.com/s3/buckets/datalake-datascience?region=eu-west-3&prefix=ANALYTICS/HTML_OUTPUT/
Add a new folder with the table name in upper case
import boto3
key, secret_ = con.load_credential()
client_lambda = boto3.client(
'lambda',
aws_access_key_id=key,
aws_secret_access_key=secret_,
region_name = region)
primary_key = ''
secondary_key = ''
y_var = ''
payload = {
"input_path": "s3://datalake-datascience/ANALYTICS/TEMPLATE_NOTEBOOKS/template_analysis_from_lambda.ipynb",
"output_prefix": "s3://datalake-datascience/ANALYTICS/OUTPUT/{}/".format(table_name.upper()),
"parameters": {
"region": "{}".format(region),
"bucket": "{}".format(bucket),
"DatabaseName": "{}".format(DatabaseName),
"table_name": "{}".format(table_name),
"group": "{}".format(','.join(partition_keys)),
"keys": "{},{}".format(primary_key,secondary_key),
"y_var": "{}".format(y_var),
"threshold":0
},
}
payload
#response = client_lambda.invoke(
# FunctionName='RunNotebook',
# InvocationType='RequestResponse',
# LogType='Tail',
# Payload=json.dumps(payload),
#)
#response
import os, time, shutil, urllib, ipykernel, json
from pathlib import Path
from notebook import notebookapp
create_report.create_report(extension = "html", keep_code = True, notebookname = notebookname)
create_schema.create_schema(name_json, path_save_image = os.path.join(parent_path, 'utils'))
### Update TOC in Github
for p in [parent_path,
str(Path(path).parent),
os.path.join(str(Path(path).parent), "00_download_data"),
#os.path.join(str(Path(path).parent.parent), "02_data_analysis"),
#os.path.join(str(Path(path).parent.parent), "02_data_analysis", "00_statistical_exploration"),
#os.path.join(str(Path(path).parent.parent), "02_data_analysis", "01_model_estimation"),
]:
try:
os.remove(os.path.join(p, 'README.md'))
except:
pass
path_parameter = os.path.join(parent_path,'utils', name_json)
md_lines = make_toc.create_index(cwd = p, path_parameter = path_parameter)
md_out_fn = os.path.join(p,'README.md')
if p == parent_path:
make_toc.replace_index(md_out_fn, md_lines, Header = os.path.basename(p).replace('_', ' '), add_description = True, path_parameter = path_parameter)
else:
make_toc.replace_index(md_out_fn, md_lines, Header = os.path.basename(p).replace('_', ' '), add_description = False)